/* Generates lbd_age1to5_2025.dta for Guo and Wallskog (2025) */

/* Firm-level dataset, where firm = firmid X state X naics2 */

/* Library X: project library. The only member read in this file is x.cpi
   (year-level CPI multiplier used to deflate payroll). */
libname x "pathname here";

/* Panel window: &miny. and &maxy. are the first and last LBD years processed
   by the %do loops below. &miny. is also embedded in the variable name
   num_states_since_&miny.. */
%let miny = 2000;
/* NOTE(review): miny2 and miny3 are not referenced anywhere in the visible
   source - presumably left over from an earlier version; confirm before removing. */
%let miny2 = 2004;
%let miny3 = 2005;
%let maxy = 2015;

/* Helper: top-&n. values of &grp. (e.g. lfo, county) by employment within firm.

   Input  : WORK.LBD with variables firmid, state, naics&d., &grp., emp.
   Params : grp = name of the grouping variable to rank
            d   = number of NAICS digits (suffix of the naics&d. variable)
            n   = how many ranks to keep (default 5)
   Output : &grp._sum  - total emp per firmid X state X naics&d. X &grp.
            &grp.      - same rows plus RANK (1 = largest emp within firm;
                         ties keep ascending &grp. order because PROC SORT
                         preserves input order among equal keys)
            &grp._firm - one row per firmid X state X naics&d. with
                         &grp._top1..&grp._top&n. and &grp._top1_emp..&grp._top&n._emp
   Defined once at top level so it is not recompiled for every year. */
%macro top_by_emp(grp, d, n=5);
  %local r;

  /* employment totals per firm X &grp. (sorts WORK.LBD in place) */
  proc sort data=lbd; by firmid state naics&d. &grp.; run;
  proc means data=lbd noprint;
    var emp;
    by firmid state naics&d. &grp.;
    output out=&grp._sum(drop=_:)
      sum(emp) = emp_&grp.;
  run;

  /* rank within firm, largest employment first; sorting on the real keys
     avoids building a concatenated string id */
  proc sort data=&grp._sum out=&grp.;
    by firmid state naics&d. descending emp_&grp.;
  run;
  data &grp.;
    set &grp.;
    by firmid state naics&d.;
    rank + 1;                              /* running counter, retained */
    if first.naics&d. then rank = 1;       /* restart for each firm     */
  run;

  /* pivot ranks 1..&n. into wide columns, one merge per rank */
  %do r=1 %to &n.;
    data temp;
      set &grp.(where=(rank=&r.));
      &grp._top&r. = &grp.;
      &grp._top&r._emp = emp_&grp.;
      keep firmid state naics&d. &grp._top&r.:;
    run;
    %if &r. = 1 %then %do;
      data &grp._firm; set temp; run;
    %end;
    %else %do;
      data &grp._firm;
        merge &grp._firm temp;
        by firmid state naics&d.;
      run;
    %end;
  %end;
%mend top_by_emp;

%macro byd(d); /* 2 vs. 3 digit */

/* Build up lbd */
proc delete data=lbd_all; run;

/* Loop-invariant setup, done once instead of once per year */
libname l "lbd pathname here";
proc sort data=x.cpi; by year; run;

/* byy: for each year &miny..&maxy, read the LBD year file, deflate payroll,
   aggregate to firm = firmid X state X naics&d., attach the top-5 LFOs and
   counties by employment, and append the result to WORK.LBD_ALL. */
%macro byy;
  %local y;
  %do y=&miny. %to &maxy.;
    /* Read one LBD year; keep rows with positive employment and payroll */
    data lbd;
      set l.lbd&y.c_c201600(where=(emp>0 and pay>0 and emp~=. and pay~=.));
      naics&d. = substr(naics,1,&d.); /* filled in for 2003 - */
      year = &y.;
      /* if missing naics, take bestnaics */
      if naics="00000000" or naics="" then naics&d. = substr(bestnaics,1,&d.);
      keep emp year pay state firmid naics&d. firstyear lastyear lfo county;
      rename firstyear = lbdnum_firstyear;
      rename lastyear = lbdnum_lastyear;
    run;

    /* Convert nominal pay to real pay (cpi_mult comes from x.cpi) */
    proc sort data=lbd; by year; run;
    data lbd;
      merge lbd(in=a) x.cpi;
      by year;
      if a=1;
      rpay = pay * cpi_mult;
    run;

    /* Within state-year-naics, aggregate to firm level:
       total emp and real pay, number of input rows (num_estab), and the
       earliest first-year / latest last-year observed this year */
    proc sort data=lbd; by firmid state naics&d.; run;
    proc means data=lbd noprint;
      var emp rpay lbdnum_firstyear lbdnum_lastyear;
      by firmid state naics&d.;
      output out=firms(drop=_:)
        sum(emp rpay)         = emp rpay
        n(emp)                = num_estab
        min(lbdnum_firstyear) = lbd_firstyear_winyr
        max(lbdnum_lastyear)  = lbd_lastyear_winyr;
    run;
    data firms;
      set firms;
      log_emp = log(emp);
      log_rpay = log(rpay);
      year = &y.;
    run;

    /* Find most-employment LFOs for each firmid-state-year-naics (top 5) */
    %top_by_emp(lfo, &d.)
    data firms;
      merge firms lfo_firm;
      by firmid state naics&d.;
    run;

    /* Find most-employment county for each firmid-state-year-naics (top 5) */
    %top_by_emp(county, &d.)

    /* total number of distinct counties per firm (rows of county_sum) */
    proc means data=county_sum noprint;
      var emp_county;
      by firmid state naics&d.;
      output out=count_counties(drop=_:)
        n(emp_county) = number_counties;
    run;

    data firms;
      merge firms county_firm count_counties;
      by firmid state naics&d.;
    run;

    /* stack this year's firm rows onto the panel */
    proc append data=firms base=lbd_all; run;
  %end;
%mend; %byy;


/* Flag - multi-state currently? multi-state previously?
   For every firmid and year, count
     num_states_current       = distinct states the firmid has rows in that year
     num_states_since_&miny.  = distinct states it has had rows in from &miny.
                                up to and including that year
   and attach both counts to every lbd_all row of that firmid-year.

   The per-year counts are stacked into num_states_all and merged into lbd_all
   ONCE. Merging year by year is wrong: after the first year lbd_all already
   carries the two count variables, and in a many-to-one match-merge the
   (missing) value re-read from lbd_all overwrites the value from the lookup on
   the 2nd and later rows of each firmid-year group, so firms with several
   state/industry rows would lose their counts. */
proc sort data=lbd_all; by firmid year; run;
proc delete data=num_states_all; run;
%macro byy2;
  %local y;
  %do y=&miny. %to &maxy.;
      /* distinct firmid X state pairs present in year &y. */
      proc sort data=lbd_all(keep=year firmid state where=(year=&y.)) out=t nodupkey; by firmid state; run;
      proc means data=t noprint;
        var year;
        by firmid;
        output out=num_states_current(drop=_:)
        N = num_states_current;
        run;
      /* distinct firmid X state pairs seen in any year up to &y. */
      proc sort data=lbd_all(keep=year firmid state where=(year<=&y.)) out=t nodupkey; by firmid state; run;
      proc means data=t noprint;
        var year;
        by firmid;
        output out=num_states_since_&miny.(drop=_:)
        N = num_states_since_&miny.;
        run;
      /* one row per firmid active in year &y. */
      data combo;
        merge num_states_current(in=a) num_states_since_&miny.;
        by firmid;
        if a=1;
        year=&y.;
        run;
      proc append data=combo base=num_states_all; run;
  %end;

  /* single many-to-one merge: the count variables exist only in the lookup,
     so they are carried to every row of the firmid-year group */
  proc sort data=num_states_all; by firmid year; run;
  data lbd_all;
    merge lbd_all(in=a) num_states_all;
    by firmid year;
    if a=1;
    run;
%mend; %byy2;

/* Firm age, two definitions:
     age_ind_state = year - earliest LBD first-year over all rows of the same
                     firmid X naics&d. X state
     age           = year - earliest LBD first-year over all rows of the same
                     firmid (any state, any industry)
   Also derives real pay per worker and the multi-state indicators. */
proc sort data=lbd_all;
  by firmid naics&d. state;
run;

/* first / last year, LBD definition, within firmid X industry X state */
proc summary data=lbd_all;
  by firmid naics&d. state;
  var lbd_firstyear_winyr lbd_lastyear_winyr;
  output out=first_lbd(drop=_:)
    min(lbd_firstyear_winyr) = lbd_firstyear
    max(lbd_lastyear_winyr)  = lbd_lastyear;
run;

/* nationally new? first / last year, LBD definition, within firmid only
   (the sort above is by firmid first, so BY firmid is valid) */
proc summary data=lbd_all;
  by firmid;
  var lbd_firstyear_winyr lbd_lastyear_winyr;
  output out=first_lbd_nationally(drop=_:)
    min(lbd_firstyear_winyr) = lbd_firstyear_nat
    max(lbd_lastyear_winyr)  = lbd_lastyear_nat;
run;

/* attach industry-state first year and compute row-level measures */
data lbd_all;
  merge lbd_all(in=in_panel) first_lbd;
  by firmid naics&d. state;
  if in_panel;
  /* = 0 in actual first year, = 1 in first year for sure with emp, etc. */
  age_ind_state = year - lbd_firstyear;
  rpay_pw     = rpay / emp;
  log_rpay_pw = log(rpay_pw);
  /* 0/1 indicators; a missing count compares as not > 1, giving 0 */
  multi_state_current      = (num_states_current > 1);
  multi_state_since_&miny. = (num_states_since_&miny. > 1);
run;

/* attach national first year and compute national age */
data lbd_all;
  merge lbd_all(in=in_panel) first_lbd_nationally;
  by firmid;
  if in_panel;
  /* = 0 in actual first year, = 1 in first year for sure with emp, etc. */
  age = year - lbd_firstyear_nat;
run;

/* Flag: entering or exiting this year?
   For each firmid X state X naics&d. X year row:
     enter   = 1 when there is no row for the same unit in year-1
     exit    = 1 when there is no row for the same unit in year+1
     dhs_emp = (emp - emp_prev) / (0.5 * (emp + emp_prev)), set to 2 when
               there is no previous-year row
   NOTE(review): lbd_all only covers &miny.-&maxy., so every row in &miny. has
   enter=1 and dhs_emp=2, and every row in &maxy. has exit=1 - confirm that
   downstream code treats those boundary years accordingly. */
proc sort data=lbd_all;
  by firmid state naics&d. year;
run;

/* last year's employment, re-dated forward so it lines up with this year */
data prev;
  set lbd_all(keep=firmid state naics&d. year emp rename=(emp=emp_prev));
  year = year + 1;
run;

/* next year's employment, re-dated backward so it lines up with this year */
data next;
  set lbd_all(keep=firmid state naics&d. year emp rename=(emp=emp_next));
  year = year - 1;
run;

data lbd;
  merge lbd_all(in=in_cur) prev(in=in_prev) next(in=in_next);
  by firmid state naics&d. year;
  if in_cur;
  enter = (in_prev = 0);
  exit  = (in_next = 0);
  if emp_prev = . then dhs_emp = 2;
  else dhs_emp = (emp - emp_prev) / (.5 * (emp + emp_prev));
run;

/* identify balanced sample
   For each firmid X naics&d. X state, look up the row observed at age 1..5
   and record on every row of that unit:
     survive_age&a.      = 1 if a row with age=&a. exists, else 0
     emp&a.              = employment at age &a. (missing if no such row)
     balanced            = 1 if rows exist at all of ages 1-5
     dhs_emp_age_to_&a.  = DHS growth from this row's emp to emp&a. (a=2..5),
                           -2 when there is no age-&a. row */
proc sort data=lbd;
  by firmid naics&d. state;
run;

/* bya: one lookup dataset per age (age1..age5) */
%macro bya;
  %do a=1 %to 5;
    data age&a.;
      set lbd(where=(age=&a.));
      survive_age&a. = 1;
      emp&a. = emp;
      keep firmid naics&d. state survive_age&a. emp&a.;
    run;
  %end;
%mend;
%bya;

data lbd;
  merge lbd(in=in_panel)
        age1(in=has1) age2(in=has2) age3(in=has3) age4(in=has4) age5(in=has5);
  by firmid naics&d. state;
  if in_panel;

  balanced = (has1=1 and has2=1 and has3=1 and has4=1 and has5=1);

  array _surv{5}   survive_age1-survive_age5;
  array _empat{5}  emp1-emp5;
  array _dhs{2:5}  dhs_emp_age_to_2-dhs_emp_age_to_5;

  /* no row at that age -> did not survive to it */
  do _k = 1 to 5;
    if _surv{_k} = . then _surv{_k} = 0;
  end;

  /* growth from current employment to employment at ages 2..5 */
  do _k = 2 to 5;
    if _empat{_k} = . then _dhs{_k} = -2;
    else _dhs{_k} = (_empat{_k} - emp) / (.5 * (_empat{_k} + emp));
  end;

  drop _k;
run;

/* Export */
/* Writes the rows of WORK.LBD with age<=5 to lbd_age1to5_2025.dta, replacing
   any existing file. The path is relative, so the file lands in the SAS
   session's current directory. No DBMS= is given, so the output format is
   taken from the .dta extension.
   NOTE(review): in SAS a missing value compares lower than any number, so
   age<=5 also keeps rows where age is missing, as well as age 0 or negative -
   confirm this matches the intended "age 1 to 5" sample.
   NOTE(review): the file name does not depend on &d., so calling %byd with a
   different digit count overwrites the same file. */
proc export data=lbd(where=(age<=5))
  outfile = "lbd_age1to5_2025.dta"
  replace;
  run;

/* end of %byd */
%mend;
/* Run the whole pipeline at the 2-digit NAICS level */
%byd(2);

